Commit 9729edac — authored December 15, 2018 by hjchen2
Support feed multi inputs and fetch multi outputs
Parent: f20c9041

Showing 24 changed files with 634 additions and 587 deletions (+634 −587)
src/framework/executor.cpp            +207 −290
src/framework/executor.h               +38  −49
src/framework/loader.cpp               +30  −40
src/framework/loader.h                 +21  −21
src/framework/lod_tensor.h             +27   −3
src/framework/program/program.h         +2   −3
src/framework/scope.h                   +1   −0
src/framework/tensor.h                  +0   −1
src/io/api_paddle_mobile.cc            +22  −22
src/io/api_paddle_mobile.h              +2   −2
src/io/ios_io/PaddleMobileCPU.mm        +4   −3
src/io/jni/paddle_mobile_jni.cpp       +14   −7
src/io/paddle_mobile.cpp              +108  −77
src/io/paddle_mobile.h                 +32  −22
src/io/paddle_test_inference_api.cpp    +9   −7
src/io/paddle_test_inference_api.h      +4   −1
test/CMakeLists.txt                     +4   −1
test/executor_for_test.h               +20  −27
test/net/test_benchmark.cpp             +3   −2
test/net/test_eng.cpp                   +2   −2
test/net/test_googlenet.cpp             +2   −2
test/net/test_nlp.cpp                   +4   −4
test/net/test_ocr.cpp                  +77   −0
tools/pre-commit.hooks/cpplint.hook     +1   −1
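Taken together, this commit replaces the old single-tensor Predict/PredictLod entry points with named inputs and outputs. A minimal caller-side sketch of the new flow, assuming a model directory ./mobilenet and graph variables named "image" and "scores" (all three names are illustrative, not taken from this commit):

    using namespace paddle_mobile;

    framework::Loader<CPU, float> loader;
    auto program = loader.Load("./mobilenet", /*optimize=*/true);
    framework::Executor<CPU, float> executor(program, /*batch_size=*/1,
                                             /*use_optimize=*/true,
                                             /*lod_mode=*/false);

    framework::Tensor image;  // filled by the caller
    // Feed any number of inputs, each addressed by its variable name ...
    std::vector<std::pair<std::string, framework::Tensor>> inputs = {
        {"image", image}};
    if (executor.Predict(inputs) == PMSuccess) {
      // ... and fetch any number of outputs the same way.
      auto scores = executor.GetOutput("scores");
    }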
src/framework/executor.cpp  (+207 −290)

@@ -28,11 +28,6 @@
 #include "framework/tensor.h"
 #include "memory/t_malloc.h"
-#ifdef PADDLE_EXECUTOR_MULTITHREAD
-#include <queue>
-#include "common/threadpool.h"
-#endif
 #ifdef PADDLE_MOBILE_CL
 #include "framework/cl/cl_image.h"
 #endif

@@ -40,66 +35,67 @@
 namespace paddle_mobile {
 namespace framework {

 using framework::Variable;

 #pragma mark - executor

-template <typename Dtype, Precision P>
-Executor<Dtype, P>::Executor(const framework::Program<Dtype> p, int batch_size,
-                             const bool use_optimize, const bool loddable)
-    : program_(p),
+template <typename Device, typename T>
+Executor<Device, T>::Executor(const Program<Device> &program, int batch_size,
+                              const bool use_optimize, const bool lod_mode)
+    : program_(program),
       batch_size_(batch_size),
       use_optimize_(use_optimize),
-      loddable_(loddable) {
+      lod_mode_(lod_mode) {
+  DLOG << "executor in lod mode: " << lod_mode_;
   Variable *variable_ptr = program_.scope->Var("batch_size");
   variable_ptr->SetValue<int>(batch_size);
-  to_predict_program_ =
+  program_desc_ =
       use_optimize_ ? program_.optimizeProgram : program_.originProgram;
-  PADDLE_MOBILE_ENFORCE(to_predict_program_ != nullptr,
-                        "to_predict_program_ == NULL!");
-  const std::vector<std::shared_ptr<framework::BlockDesc>> &blocks =
-      to_predict_program_->Blocks();
-  DLOG << "executor in loaddable mode: " << loddable_;
+  PADDLE_MOBILE_ENFORCE(program_desc_ != nullptr,
+                        "program_desc_ should not be nullptr");
+  const auto &blocks = program_desc_->Blocks();
+  ops_of_block_.resize(blocks.size());
   for (int i = 0; i < blocks.size(); ++i) {
-    std::shared_ptr<framework::BlockDesc> block_desc = blocks[i];
-    std::vector<std::shared_ptr<framework::OpDesc>> ops = block_desc->Ops();
+    std::shared_ptr<BlockDesc> block_desc = blocks[i];
+    std::vector<std::shared_ptr<OpDesc>> ops = block_desc->Ops();
     for (int j = 0; j < ops.size(); ++j) {
-      std::shared_ptr<framework::OpDesc> op = ops[j];
-      DLOG << "create op: " << op->Type();
-      auto op_base = framework::OpRegistry<Dtype>::CreateOp(
-          op->Type(), op->GetInputs(), op->GetOutputs(), op->GetAttrMap(),
-          program_.scope);
-      // infer shape to reshape tensor before predict,
-      // but for lod tensor, it will still need to reshape in runtime
-      if (!loddable_) {
-        op_base->InferShape();
+      std::shared_ptr<OpDesc> op_desc = ops[j];
+      DLOG << "create op: " << op_desc->Type();
+      auto op_handler = OpRegistry<Device>::CreateOp(
+          op_desc->Type(), op_desc->GetInputs(), op_desc->GetOutputs(),
+          op_desc->GetAttrMap(), program_.scope);
+      // infer shape to reshape inputs and outputs before predict,
+      // but for lod mode, it still need to infer shape in runtime
+      if (!lod_mode) {
+        op_handler->InferShape();
       }
-      ops_of_block_[*block_desc.get()].push_back(op_base);
+      ops_of_block_[i].push_back(op_handler);
     }
   }
   if (program_.combined) {
     InitCombineMemory();
   } else {
     InitMemory();
   }
-  std::shared_ptr<framework::BlockDesc> to_predict_block =
-      to_predict_program_->Block(0);
-  int i = 0;
-  auto &ops = ops_of_block_[*to_predict_block.get()];
-  for (const auto &op : ops) {
-    DLOG << "Initialize op[" << i++ << "]: " << op->Type();
-    op->Init();
+  int count = 0;
+  for (int block_id = 0; block_id < ops_of_block_.size(); ++block_id) {
+    for (auto &op_handler : ops_of_block_[block_id]) {
+      DLOG << "Initialize op[" << count++ << "]: " << op_handler->Type();
+      op_handler->Init();
+      ops_list_.push_back(op_handler);
+    }
   }
 }

-template <typename Dtype>
-static void LoadMemInternal(void **data, framework::LoDTensor *tensor,
+template <typename Device>
+static void LoadMemInternal(void **data, LoDTensor *tensor,
                             bool quant_uint8 = false) {
   char **data_buf = reinterpret_cast<char **>(data);
   int64_t size = tensor->numel();
-  Dtype *tensor_data = tensor->mutable_data<Dtype>();
+  Device *tensor_data = tensor->mutable_data<Device>();
   if (quant_uint8) {
     // should be moved into operator init function
     float min_value;
...

@@ -114,15 +110,15 @@ static void LoadMemInternal(void **data, framework::LoDTensor *tensor,
     }
     data_buf += size * sizeof(uint8_t);
   } else {
-    memory::Copy(tensor_data, *data_buf, size * sizeof(Dtype));
-    *data_buf += size * sizeof(Dtype);
+    memory::Copy(tensor_data, *data_buf, size * sizeof(Device));
+    *data_buf += size * sizeof(Device);
   }
 }

-template <typename Dtype, Precision P>
-void Executor<Dtype, P>::LoadMemory(
-    void **data, const std::shared_ptr<framework::VarDesc> var_desc,
-    framework::LoDTensor *tensor) {
+template <typename Device, typename T>
+void Executor<Device, T>::LoadMemory(
+    void **data, const std::shared_ptr<VarDesc> var_desc, LoDTensor *tensor) {
   char **data_buf = reinterpret_cast<char **>(data);
   // version
   uint32_t version = *(reinterpret_cast<uint32_t *>(*data_buf));
...

@@ -152,18 +148,18 @@ void Executor<Dtype, P>::LoadMemory(
   // skip tensor desc
   *data_buf += tensor_desc_size;
-  const framework::TensorDesc &tensor_desc = var_desc->Tensor_desc();
-  tensor->Resize(framework::make_ddim(tensor_desc.Dims()));
+  const TensorDesc &tensor_desc = var_desc->Tensor_desc();
+  tensor->Resize(make_ddim(tensor_desc.Dims()));
   // parse tensor from stream
   switch (tensor_desc.DataType()) {
-    case framework::VARTYPE_TYPE_FP32:
+    case VARTYPE_TYPE_FP32:
       LoadMemInternal<float>(reinterpret_cast<void **>(data_buf), tensor,
                              program_.quantification);
       break;
-    case framework::VARTYPE_TYPE_INT8:
+    case VARTYPE_TYPE_INT8:
       LoadMemInternal<int8_t>(reinterpret_cast<void **>(data_buf), tensor);
       break;
-    case framework::VARTYPE_TYPE_INT32:
+    case VARTYPE_TYPE_INT32:
       LoadMemInternal<int>(reinterpret_cast<void **>(data_buf), tensor);
       break;
     default:
...

@@ -171,12 +167,12 @@ void Executor<Dtype, P>::LoadMemory(
   }
 }

-template <typename Dtype, Precision P>
-void Executor<Dtype, P>::InitMemory() {
-  for (const auto &block : to_predict_program_->Blocks()) {
+template <typename Device, typename T>
+void Executor<Device, T>::InitMemory() {
+  for (const auto &block : program_desc_->Blocks()) {
     for (const auto &var_desc : block->Vars()) {
       auto var = program_.scope->Var(var_desc->Name());
-      auto tensor = var->template GetMutable<framework::LoDTensor>();
+      auto tensor = var->template GetMutable<LoDTensor>();
       if (var_desc->Persistable()) {
         if (var_desc->Name() == "feed" || var_desc->Name() == "fetch") {
           continue;
...

@@ -187,7 +183,7 @@ void Executor<Dtype, P>::InitMemory() {
         LoadMemory(reinterpret_cast<void **>(&data), var_desc, tensor);
         delete[] origin_data;
       } else {
-        if (var_desc->Type() == framework::VARTYPE_TYPE_LOD_TENSOR) {
+        if (var_desc->Type() == VARTYPE_TYPE_LOD_TENSOR) {
           varInputMemory(var_desc, var, tensor);
         }
       }
...

@@ -195,8 +191,8 @@ void Executor<Dtype, P>::InitMemory() {
   }
 }

-template <typename Dtype, Precision P>
-void Executor<Dtype, P>::InitCombineMemory() {
+template <typename Device, typename T>
+void Executor<Device, T>::InitCombineMemory() {
   char *origin_data = nullptr;
   bool self_alloc = false;
   if (program_.combined_params_buf && program_.combined_params_len) {
...

@@ -208,17 +204,17 @@ void Executor<Dtype, P>::InitCombineMemory() {
   }
   PADDLE_MOBILE_ENFORCE(origin_data != nullptr, "data == nullptr");
   char *data = origin_data;
-  for (const auto &block : to_predict_program_->Blocks()) {
+  for (const auto &block : program_desc_->Blocks()) {
     for (const auto &var_desc : block->Vars()) {
       auto var = program_.scope->Var(var_desc->Name());
-      auto tensor = var->template GetMutable<framework::LoDTensor>();
+      auto tensor = var->template GetMutable<LoDTensor>();
       if (var_desc->Persistable()) {
         if (var_desc->Name() == "feed" || var_desc->Name() == "fetch") {
           continue;
         }
         LoadMemory(reinterpret_cast<void **>(&data), var_desc, tensor);
       } else {
-        if (var_desc->Type() == framework::VARTYPE_TYPE_LOD_TENSOR) {
+        if (var_desc->Type() == VARTYPE_TYPE_LOD_TENSOR) {
           varInputMemory(var_desc, var, tensor);
         }
       }
...

@@ -230,168 +226,132 @@ void Executor<Dtype, P>::InitCombineMemory() {
   LOG(kLOG_INFO) << "init combine memory finish";
 }

-template <typename Dtype, Precision P>
-bool Executor<Dtype, P>::varInputMemory(
-    const std::shared_ptr<framework::VarDesc> &var_desc, Variable *var,
-    framework::LoDTensor *tensor) const {
+template <typename Device, typename T>
+bool Executor<Device, T>::varInputMemory(
+    const std::shared_ptr<VarDesc> &var_desc, Variable *var,
+    LoDTensor *tensor) const {
   auto type = var_desc->Tensor_desc().DataType();
   switch (type) {
-    case framework::VARTYPE_TYPE_FP32:
+    case VARTYPE_TYPE_FP32:
       tensor->mutable_data<float>();
       break;
-    case framework::VARTYPE_TYPE_INT8:
+    case VARTYPE_TYPE_INT8:
       tensor->mutable_data<int8_t>();
       break;
-    case framework::VARTYPE_TYPE_INT32:
+    case VARTYPE_TYPE_INT32:
       tensor->mutable_data<int32_t>();
       break;
-    case framework::VARTYPE_TYPE_INT64:
+    case VARTYPE_TYPE_INT64:
       tensor->mutable_data<int64_t>();
       break;
     default:
       break;
   }
-  bool is_mute_match = (type == framework::VARTYPE_TYPE_FP32) ||
-                       (type == framework::VARTYPE_TYPE_INT8) ||
-                       (type == framework::VARTYPE_TYPE_INT32) ||
-                       (type == framework::VARTYPE_TYPE_INT64);
+  bool is_mute_match =
+      (type == VARTYPE_TYPE_FP32) || (type == VARTYPE_TYPE_INT8) ||
+      (type == VARTYPE_TYPE_INT32) || (type == VARTYPE_TYPE_INT64);
   PADDLE_MOBILE_ENFORCE(is_mute_match, "got unhandled data type : %d", type);
   return is_mute_match;
 }

-template <typename Dtype, Precision P>
-std::shared_ptr<framework::Tensor> Executor<Dtype, P>::Predict(
-    const framework::Tensor &t) {
-  framework::Variable *g_feed_value = program_.scope->Var("feed");
-  framework::Tensor *feed_tensor =
-      g_feed_value->GetMutable<framework::LoDTensor>();
-  feed_tensor->Resize(t.dims());
-  feed_tensor->ShareDataWith(t);
-  std::shared_ptr<framework::BlockDesc> to_predict_block =
-      to_predict_program_->Block(0);
-  auto &ops = ops_of_block_[*to_predict_block.get()];
-  ... (ran every op of block 0, calling InferShape() first when loddable_,
-       collected per-op timings under PADDLE_MOBILE_PROFILE, printed the
-       profile table with printf, and returned a copy of the last op's
-       first output tensor)
-  return std::make_shared<framework::Tensor>(framework::Tensor(*output_tensor));
-}
-
-template <typename Dtype, Precision P>
-std::shared_ptr<framework::LoDTensor> Executor<Dtype, P>::PredictLod(
-    const framework::LoDTensor &t) {
-  framework::Variable *g_feed_value = program_.scope->Var("feed");
-  framework::LoDTensor *feed_tensor =
-      g_feed_value->GetMutable<framework::LoDTensor>();
-  feed_tensor->Resize(t.dims());
-  feed_tensor->ShareDataWith(t);
-  feed_tensor->set_lod(t.lod());
-  ... (same flow as the removed Predict(const Tensor &), plus set_lod)
-  return std::make_shared<framework::LoDTensor>(
-      framework::LoDTensor(*output_tensor));
-}
+template <typename Device, typename T>
+PMStatus Executor<Device, T>::Predict(
+    const std::vector<std::pair<std::string, Tensor>> &inputs) {
+  for (const auto &input : inputs) {
+    SetInput(input.second, input.first);
+  }
+  return this->Predict();
+}
+
+template <typename Device, typename T>
+PMStatus Executor<Device, T>::Predict(
+    const std::vector<std::pair<std::string, LoDTensor>> &inputs) {
+  for (const auto &input : inputs) {
+    SetInput(input.second, input.first);
+  }
+  return this->Predict();
+}
+
+template <typename Device, typename T>
+std::vector<T> Executor<Device, T>::Predict(const std::vector<T> &input,
+                                            const std::vector<int64_t> &dims) {
+  Tensor feed_tensor(input, make_ddim(dims));
+  SetInput(feed_tensor, "feed");
+  std::vector<T> output;
+  if (this->Predict() == PMSuccess) {
+    const auto output_tensor = GetOutput("fetch");
+    output.resize(output_tensor->numel());
+    memcpy(output.data(), output_tensor->template data<T>(),
+           output.size() * sizeof(T));
+  }
+  return output;
+}
+
+template <typename Device, typename T>
+void Executor<Device, T>::SetInput(const Tensor &input,
+                                   const std::string &var_name) {
+  auto *target_var = program_.scope->FindVar(var_name);
+  PADDLE_MOBILE_ENFORCE(target_var != nullptr, "Variable %s is not exist",
+                        var_name.c_str());
+  auto *target_tensor = target_var->template GetMutable<LoDTensor>();
+  target_tensor->Resize(input.dims());
+  target_tensor->ShareDataWith(input);
+}
+
+template <typename Device, typename T>
+void Executor<Device, T>::SetInput(const LoDTensor &input,
+                                   const std::string &var_name) {
+  auto *target_var = program_.scope->FindVar(var_name);
+  PADDLE_MOBILE_ENFORCE(target_var != nullptr, "Variable %s is not exist",
+                        var_name.c_str());
+  auto *target_tensor = target_var->template GetMutable<LoDTensor>();
+  target_tensor->Resize(input.dims());
+  target_tensor->ShareDataWith(input);
+  target_tensor->set_lod(input.lod());
+}
+
+template <typename Device, typename T>
+PMStatus Executor<Device, T>::Predict() {
 #ifdef PADDLE_MOBILE_PROFILE
-  std::vector<ProfInfo> profile(ops.size());
+  std::vector<ProfInfo> profile(ops_list_.size());
+  struct timespec ts;
+  int op_index = 0;
 #endif
-  for (int i = 0; i < ops.size(); i++) {
+  for (auto &block : ops_of_block_) {
+    for (auto &op_handler : block) {
 #ifdef PADDLE_MOBILE_PROFILE
-    struct timespec ts;
-    clock_gettime(CLOCK_MONOTONIC, &ts);
-    profile[i].runBegin = (uint64_t)ts.tv_sec * 1e9 + ts.tv_nsec;
+      clock_gettime(CLOCK_MONOTONIC, &ts);
+      profile[op_index].runBegin = (uint64_t)ts.tv_sec * 1e9 + ts.tv_nsec;
 #endif
-    if (loddable_) {
-      ops[i]->InferShape();
-    }
-    ops[i]->Run();
+      if (lod_mode_) {
+        op_handler->InferShape();
+      }
+      op_handler->Run();
 #ifdef PADDLE_MOBILE_PROFILE
-    clock_gettime(CLOCK_MONOTONIC, &ts);
-    profile[i].runEnd = (uint64_t)ts.tv_sec * 1e9 + ts.tv_nsec;
+      clock_gettime(CLOCK_MONOTONIC, &ts);
+      profile[op_index].runEnd = (uint64_t)ts.tv_sec * 1e9 + ts.tv_nsec;
+      ++op_index;
 #endif
+    }
   }
-  auto last_op = ops.rbegin();
-  auto output_map = (*last_op)->Outputs();
-  std::vector<std::string> out_keys = (*last_op)->GetOutKeys();
-  PADDLE_MOBILE_ENFORCE(out_keys.size() > 0, "the last op contains no output");
-  framework::LoDTensor *output_tensor =
-      framework::GetVarValue<framework::LoDTensor>(out_keys[0], output_map,
-                                                   *(program_.scope));
 #ifdef PADDLE_MOBILE_PROFILE
   std::unordered_map<std::string, uint64_t> _tp;
   for (int i = 0; i < profile.size(); i++) {
     const auto &pInfo = profile[i];
     uint64_t timeCost = pInfo.runEnd - pInfo.runBegin;
-    if (ops[i]->Type() == "conv2d") {
-      auto inputs = ops[i]->Inputs();
-      auto input_keys = ops[i]->GetInputKeys();
-      auto *filter = framework::GetVarValue<framework::LoDTensor>(
-          input_keys[1], inputs, *(program_.scope));
-      int kernel_size = filter->dims()[2];
-      printf("kernel size: %d\n", kernel_size);
-      _tp[ops[i]->Type() + "_" + std::to_string(kernel_size)] += timeCost;
+    if (ops_list_[i]->Type() == "conv2d" ||
+        ops_list_[i]->Type() == "depthwise_conv2d") {
+      auto inputs = ops_list_[i]->Inputs();
+      auto *filter =
+          GetVarValue<LoDTensor>("Filter", inputs, *(program_.scope));
+      int kernel_size = filter->dims()[2];
+      _tp[ops_list_[i]->Type() + "_" + std::to_string(kernel_size)] += timeCost;
     } else {
-      _tp[ops[i]->Type()] += timeCost;
+      _tp[ops_list_[i]->Type()] += timeCost;
     }
   }
-  printf("====================[ profile ]======================\n");
-  using prof_t = std::pair<std::string, uint64_t>;
+  DLOG << "====================[ profile ]======================";
+  typedef std::pair<std::string, uint64_t> prof_t;
   std::vector<prof_t> _tv(_tp.begin(), _tp.end());
   uint64_t _ptotal = 0;
   for (auto const &p : _tv) {
...

@@ -407,57 +367,39 @@
           static_cast<float>(p.second),
           static_cast<float>(p.second) / _ptotal * 100.0);
   }
-  printf("====================[---------]======================\n");
+  DLOG << "====================[---------]======================";
 #endif
-  return std::make_shared<framework::LoDTensor>(
-      framework::LoDTensor(*output_tensor));
+  return PMSuccess;
 }

-template <typename Dtype, Precision P>
-std::shared_ptr<framework::Tensor> Executor<Dtype, P>::Predict(
-    const framework::Tensor &t, int block_id) {
-  return Predict(t);
-}
-
-template <typename Dtype, Precision P>
-std::vector<typename Executor<Dtype, P>::Ptype> Executor<Dtype, P>::Predict(
-    const std::vector<Ptype> &input, const std::vector<int64_t> &dims) {
-  framework::Tensor tensor(input, framework::make_ddim(dims));
-  std::shared_ptr<framework::Tensor> output_tensor = Predict(tensor, 0);
-  if (output_tensor != nullptr) {
-    Executor<Dtype, P>::Ptype *output_ptr =
-        output_tensor->data<typename Executor<Dtype, P>::Ptype>();
-    std::vector<typename Executor<Dtype, P>::Ptype> result_vector;
-    for (int j = 0; j < output_tensor->numel(); ++j) {
-      result_vector.push_back(output_ptr[j]);
-    }
-    return result_vector;
-  } else {
-    DLOG << "return empty vector";
-    return {};
-  }
-}
+template <typename Device, typename T>
+std::shared_ptr<LoDTensor> Executor<Device, T>::GetOutput(
+    const std::string &var_name) {
+  auto *target_var = program_.scope->FindVar(var_name);
+  PADDLE_MOBILE_ENFORCE(target_var != nullptr, "Variable %s is not exist",
+                        var_name.c_str());
+  auto *output_tensor = target_var->template GetMutable<LoDTensor>();
+  return std::make_shared<LoDTensor>(*output_tensor);
+}

 #ifdef PADDLE_MOBILE_FPGA
-template <typename Dtype, Precision P>
-void Executor<Dtype, P>::InjectVariable(const framework::Tensor &t,
-                                        std::string var_name) {
-  framework::Variable *g_feed_value = program_.scope->Var(var_name);
-  framework::Tensor *feed_tensor =
-      g_feed_value->GetMutable<framework::LoDTensor>();
+template <typename Device, typename T>
+void Executor<Device, T>::InjectVariable(const Tensor &t,
+                                         std::string var_name) {
+  Variable *g_feed_value = program_.scope->Var(var_name);
+  Tensor *feed_tensor = g_feed_value->GetMutable<LoDTensor>();
   feed_tensor->Resize(t.dims());
   feed_tensor->ShareDataWith(t);
 }

-template <typename Dtype, Precision P>
-void Executor<Dtype, P>::FeedData(const framework::Tensor &t) {
+template <typename Device, typename T>
+void Executor<Device, T>::FeedData(const Tensor &t) {
   InjectVariable(t, "feed");
 }

-template <typename Dtype, Precision P>
-std::shared_ptr<framework::Tensor> Executor<Dtype, P>::FetchResult(int id) {
-  std::shared_ptr<framework::BlockDesc> to_predict_block =
-      to_predict_program_->Block(0);
+template <typename Device, typename T>
+std::shared_ptr<Tensor> Executor<Device, T>::FetchResult(int id) {
+  std::shared_ptr<BlockDesc> to_predict_block = program_desc_->Block(0);
   auto &ops = ops_of_block_[*to_predict_block.get()];
   PADDLE_MOBILE_ENFORCE(id < (int)ops.size(), "Index out of range");
...

@@ -465,15 +407,14 @@ std::shared_ptr<framework::Tensor> Executor<Dtype, P>::FetchResult(int id) {
   auto output_map = op->Outputs();
   std::vector<std::string> out_keys = op->GetOutKeys();
   PADDLE_MOBILE_ENFORCE(!out_keys.empty(), "this op contains no output");
-  auto *output_tensor = framework::GetVarValue<framework::LoDTensor>(
-      out_keys[0], output_map, *(program_.scope));
-  return std::make_shared<framework::Tensor>(
-      framework::Tensor(*output_tensor));
+  auto *output_tensor =
+      GetVarValue<LoDTensor>(out_keys[0], output_map, *(program_.scope));
+  return std::make_shared<Tensor>(Tensor(*output_tensor));
 }

-template <typename Dtype, Precision P>
-void Executor<Dtype, P>::Predict_From_To(int start, int end) {
-  std::shared_ptr<framework::BlockDesc> to_predict_block =
-      to_predict_program_->Block(0);
+template <typename Device, typename T>
+void Executor<Device, T>::Predict_From_To(int start, int end) {
+  std::shared_ptr<BlockDesc> to_predict_block = program_desc_->Block(0);
   auto &ops = ops_of_block_[*to_predict_block.get()];
   end = end < 0 ? static_cast<int>(ops.size()) : end;
   PADDLE_MOBILE_ENFORCE(start >= 0 && start < end && end <= ops.size(),
...

@@ -498,25 +439,26 @@ void Executor<Dtype, P>::Predict_From_To(int start, int end) {
   }
 }

-template <typename Dtype, Precision P>
-void Executor<Dtype, P>::Predict_From(int start) {
+template <typename Device, typename T>
+void Executor<Device, T>::Predict_From(int start) {
   Predict_From_To(start);
 }

-template <typename Dtype, Precision P>
-void Executor<Dtype, P>::Predict_To(int end) {
+template <typename Device, typename T>
+void Executor<Device, T>::Predict_To(int end) {
   Predict_From_To(0, end);
 }
 #endif

 #ifdef PADDLE_MOBILE_CL
-template <typename Dtype, Precision P>
-void Executor<Dtype, P>::LoadMemory(const framework::VarDesc var_desc,
-                                    float *tensorInput, char **data) {}
+template <typename Device, typename T>
+void Executor<Device, T>::LoadMemory(const VarDesc var_desc,
+                                     float *tensorInput, char **data) {}

 template <>
-void Executor<GPU_CL, Precision::FP32>::LoadMemory(
-    const framework::VarDesc var_desc, float *tensorInput, char **data) {
+void Executor<GPU_CL, Precision::FP32>::LoadMemory(const VarDesc var_desc,
+                                                   float *tensorInput,
+                                                   char **data) {
   // 1. version
   uint32_t version = *reinterpret_cast<uint32_t *>(*data);
...

@@ -554,38 +496,13 @@ void Executor<GPU_CL, Precision::FP32>::LoadMemory(
   }
   (*data) += (sizeof(char) * size);

-  const framework::TensorDesc &desc = var_desc.Tensor_desc();
+  const TensorDesc &desc = var_desc.Tensor_desc();
   int memory_size = 1;
   for (auto l : desc.Dims()) {
     memory_size *= l;
   }
   void *memory = nullptr;
-  (a ~25-line commented-out switch over desc.DataType() computing type_size
-   for FP16/FP32/FP64/INT32/INT64/BOOL is deleted here)
   int type_size = 4;
   memory = tensorInput;
   if (program_.quantification) {
...

@@ -616,24 +533,24 @@
 }

 template <>
-void Executor<GPU_CL, Precision::FP32>::InitMemory() {
-  for (const auto &block : to_predict_program_->Blocks()) {
+void Executor<GPU_CL, float>::InitMemory() {
+  for (const auto &block : program_desc_->Blocks()) {
     for (const auto &var_desc : block->Vars()) {
       auto var = program_.scope->Var(var_desc->Name());
       if (var_desc->Persistable()) {
         CLImage *cl_image = nullptr;
         if (var_desc->Name() == "feed" || var_desc->Name() == "fetch") {
-          var->template GetMutable<framework::LoDTensor>();
+          var->template GetMutable<LoDTensor>();
           continue;
         } else {
-          cl_image = var->template GetMutable<framework::CLImage>();
+          cl_image = var->template GetMutable<CLImage>();
         }
         char *origin_data =
             ReadFileToBuff(program_.model_path + "/" + var_desc->Name());
         char *data = origin_data;
         cl_context context = program_.scope->GetCLScpoe()->Context();
-        const framework::TensorDesc &desc = var_desc->Tensor_desc();
+        const TensorDesc &desc = var_desc->Tensor_desc();
         int numel = 1;
         for (auto l : desc.Dims()) {
           numel *= l;
...

@@ -643,7 +560,7 @@ void Executor<GPU_CL, Precision::FP32>::InitMemory() {
             paddle_mobile::memory::Alloc(sizeof(float) * numel));
         LoadMemory(*var_desc, tensorInput, &data);
-        framework::DDim ddim = framework::make_ddim(desc.Dims());
+        DDim ddim = make_ddim(desc.Dims());
         // has not init
         cl_image->SetTensorData(tensorInput, ddim);
...

@@ -651,15 +568,15 @@ void Executor<GPU_CL, Precision::FP32>::InitMemory() {
         delete origin_data;
         paddle_mobile::memory::Free(tensorInput);
       } else {
-        if (var_desc->Type() == framework::VARTYPE_TYPE_LOD_TENSOR) {
-          auto cl_image = var->template GetMutable<framework::CLImage>();
+        if (var_desc->Type() == VARTYPE_TYPE_LOD_TENSOR) {
+          auto cl_image = var->template GetMutable<CLImage>();
           cl_context context = program_.scope->GetCLScpoe()->Context();
           cl_command_queue command_queue =
               program_.scope->GetCLScpoe()->CommandQueue();
-          const framework::TensorDesc &desc = var_desc->Tensor_desc();
-          // framework::DDim ddim = framework::make_ddim(desc.Dims());
-          framework::DDim ddim = cl_image->dims();
+          const TensorDesc &desc = var_desc->Tensor_desc();
+          // DDim ddim = make_ddim(desc.Dims());
+          DDim ddim = cl_image->dims();
           DLOG << var_desc->Name();
           cl_image->InitEmptyImage(context, command_queue, ddim);
         }
...

@@ -669,7 +586,7 @@ void Executor<GPU_CL, Precision::FP32>::InitMemory() {
 }

 template <>
-void Executor<GPU_CL, Precision::FP32>::InitCombineMemory() {
+void Executor<GPU_CL, float>::InitCombineMemory() {
   char *origin_data = nullptr;
   bool self_alloc = false;
   if (program_.combined_params_buf && program_.combined_params_len) {
...

@@ -683,22 +600,22 @@ void Executor<GPU_CL, Precision::FP32>::InitCombineMemory() {
   PADDLE_MOBILE_ENFORCE(origin_data != nullptr, "origin_data==nullptr!!!");
   float *data = reinterpret_cast<float *>(origin_data);

-  for (const auto &block : to_predict_program_->Blocks()) {
+  for (const auto &block : program_desc_->Blocks()) {
     for (const auto &var_desc : block->Vars()) {
       auto var = program_.scope->Var(var_desc->Name());
       if (var_desc->Persistable()) {
         CLImage *cl_image = nullptr;
         if (var_desc->Name() == "feed" || var_desc->Name() == "fetch") {
-          var->template GetMutable<framework::LoDTensor>();
+          var->template GetMutable<LoDTensor>();
           continue;
         } else {
-          cl_image = var->template GetMutable<framework::CLImage>();
+          cl_image = var->template GetMutable<CLImage>();
         }
         cl_context context = program_.scope->GetCLScpoe()->Context();
-        const framework::TensorDesc &desc = var_desc->Tensor_desc();
-        framework::DDim ddim = framework::make_ddim(desc.Dims());
+        const TensorDesc &desc = var_desc->Tensor_desc();
+        DDim ddim = make_ddim(desc.Dims());
         int numel = 1;
         for (int i = 0; i < ddim.size(); i++) {
...

@@ -713,13 +630,13 @@ void Executor<GPU_CL, Precision::FP32>::InitCombineMemory() {
         paddle_mobile::memory::Free(tensorInput);
       } else {
-        auto cl_image = var->template GetMutable<framework::CLImage>();
+        auto cl_image = var->template GetMutable<CLImage>();
         cl_context context = program_.scope->GetCLScpoe()->Context();
         cl_command_queue command_queue =
             program_.scope->GetCLScpoe()->CommandQueue();
-        const framework::TensorDesc &desc = var_desc->Tensor_desc();
-        framework::DDim ddim = cl_image->dims();
-        // framework::DDim ddim = framework::make_ddim(desc.Dims());
+        const TensorDesc &desc = var_desc->Tensor_desc();
+        DDim ddim = cl_image->dims();
+        // DDim ddim = make_ddim(desc.Dims());
         cl_image->InitEmptyImage(context, command_queue, ddim);
       }
     }
...

@@ -732,13 +649,13 @@ void Executor<GPU_CL, Precision::FP32>::InitCombineMemory() {
 #endif

-template class Executor<CPU, Precision::FP32>;
-template class Executor<FPGA, Precision::FP32>;
-template class Executor<GPU_CL, Precision::FP32>;
-template class Executor<GPU_MALI, Precision::FP32>;
+template class Executor<CPU, float>;
+template class Executor<FPGA, float>;
+template class Executor<GPU_CL, float>;
+template class Executor<GPU_MALI, float>;

 }  // namespace framework
 }  // namespace paddle_mobile
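For callers that only have one flattened input, executor.cpp keeps a convenience overload: Predict(const std::vector<T> &input, const std::vector<int64_t> &dims) wraps the data in a Tensor, feeds it to the "feed" variable, runs Predict(), and copies the "fetch" output back into a std::vector<T>. A hedged caller-side sketch, given an executor constructed as in the earlier example (the input values and shape are made up for illustration):

    // Illustrative only: a 1x3x224x224 float input flattened into a vector.
    std::vector<float> input(1 * 3 * 224 * 224, 0.5f);
    std::vector<int64_t> dims = {1, 3, 224, 224};

    // Internally: Tensor feed(input, make_ddim(dims)); SetInput(feed, "feed");
    // then Predict(); then GetOutput("fetch") copied into the returned vector.
    std::vector<float> output = executor.Predict(input, dims);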
src/framework/executor.h  (+38 −49)

@@ -17,6 +17,7 @@
 #include <map>
 #include <memory>
 #include <string>
+#include <utility>
 #include <vector>
 #include "common/types.h"
 #include "common/util.h"
...

@@ -28,41 +29,29 @@
 namespace paddle_mobile {
 namespace framework {

-template <typename Dtype = CPU, Precision P = Precision::FP32>
+template <typename Device, typename T = float>
 class Executor {
  public:
-  typedef typename PrecisionTrait<P>::ptype Ptype;
-
-  // exector constructor
-  // @param program program converted from proto program in PaddlePaddle
-  // @param use_optimize bool whether use operator fusion to speed up or not
-  // @param loddable bool
-  Executor(const framework::Program<Dtype> program, int batch_size = 1,
-           const bool use_optimize = true, const bool loddable = false);
-
-  // predict with tensor input
-  // @param t input tensor to do prediction
-  // @return predicted tensor
-  std::shared_ptr<framework::Tensor> Predict(const framework::Tensor &t);
-
-  // predict with lod tensor input
-  // @param t input lod tensor to do prediction
-  // @return predicted lod tensor
-  std::shared_ptr<framework::LoDTensor> PredictLod(
-      const framework::LoDTensor &t);
-
-  // predict with vector input and dims
-  // @param input vector whose elements will be formed into the input tensor
-  // @param dims the input tensor shape
-  // @return vector which is flatted from predicted tensor
-  std::vector<Ptype> Predict(const std::vector<Ptype> &input,
-                             const std::vector<int64_t> &dims);
+  Executor(const Program<Device> &program, int batch_size = 1,
+           const bool use_optimize = true, const bool lod_mode = false);
+
+  PMStatus Predict(const std::vector<std::pair<std::string, Tensor>> &inputs);
+  PMStatus Predict(
+      const std::vector<std::pair<std::string, LoDTensor>> &inputs);
+
+  std::vector<T> Predict(const std::vector<T> &input,
+                         const std::vector<int64_t> &dims);
+  PMStatus Predict();
+
+  void SetInput(const Tensor &input, const std::string &var_name);
+  void SetInput(const LoDTensor &input, const std::string &var_name);
+  std::shared_ptr<LoDTensor> GetOutput(const std::string &var_name);

 #ifdef PADDLE_MOBILE_FPGA
-  void InjectVariable(const framework::Tensor &t, std::string var_name);
-  void FeedData(const framework::Tensor &t);
-  std::shared_ptr<framework::Tensor> FetchResult(int id = -1);
+  void InjectVariable(const Tensor &t, std::string var_name);
+  void FeedData(const Tensor &t);
+  std::shared_ptr<Tensor> FetchResult(int id = -1);
   void Predict_From_To(int start = 0, int end = -1);
   void Predict_From(int start);
   void Predict_To(int end);
...

@@ -70,26 +59,28 @@ class Executor {
  protected:
   Executor() = default;
-  std::shared_ptr<framework::Tensor> Predict(const framework::Tensor &t,
-                                             int block_id);
-  bool varInputMemory(const std::shared_ptr<framework::VarDesc> &var_desc,
-                      framework::Variable *var,
-                      framework::LoDTensor *tensor) const;
+  bool varInputMemory(const std::shared_ptr<VarDesc> &var_desc, Variable *var,
+                      LoDTensor *tensor) const;
   void InitMemory();
   void InitCombineMemory();
-  void LoadMemory(void **data,
-                  const std::shared_ptr<framework::VarDesc> var_desc,
-                  framework::LoDTensor *tensor);
+  void LoadMemory(void **data, const std::shared_ptr<VarDesc> var_desc,
+                  LoDTensor *tensor);
 #ifdef PADDLE_MOBILE_CL
-  void LoadMemory(const framework::VarDesc var_desc, float *tensorInput,
-                  char **data);
+  void LoadMemory(const VarDesc var_desc, float *tensorInput, char **data);
 #endif

-  framework::Program<Dtype> program_;
-  int batch_size_ = 1;
-  std::shared_ptr<framework::ProgramDesc> to_predict_program_;
-  std::map<framework::BlockDesc,
-           std::vector<std::shared_ptr<framework::OperatorBase<Dtype>>>>
-      ops_of_block_;
+  int batch_size_;
+  bool use_optimize_;
+  bool lod_mode_;
+
+  Program<Device> program_;
+  std::shared_ptr<ProgramDesc> program_desc_;
+
+  typedef std::shared_ptr<OperatorBase<Device>> OperatorBasePtr;
+  std::vector<std::vector<OperatorBasePtr>> ops_of_block_;
+  // operators list
+  std::vector<OperatorBasePtr> ops_list_;
+
 #ifdef PADDLE_MOBILE_PROFILE
   struct ProfInfo {
     int tid = 0;
...

@@ -97,8 +88,6 @@ class Executor {
     uint64_t runEnd = 0UL;
   };
 #endif
-
-  bool use_optimize_ = false;
-  bool loddable_ = false;
 };

 }  // namespace framework
...
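The public template signature changes from a Precision enum parameter to a plain element type, so existing instantiations change spelling. A small before/after sketch of the instantiation only (the program variable is assumed to come from Loader::Load as elsewhere in this commit):

    // Before this commit:
    //   framework::Executor<CPU, Precision::FP32> executor(program);

    // After this commit (T defaults to float):
    framework::Executor<CPU, float> executor(program, /*batch_size=*/1,
                                             /*use_optimize=*/true,
                                             /*lod_mode=*/false);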
src/framework/loader.cpp  (+30 −40)

@@ -23,14 +23,8 @@
 namespace paddle_mobile {
 namespace framework {

-/**
- * muteandresize tensor as originProgramDesc and scope in loadParams
- *
- * @param originProgramDesc
- * @param scope
- */
-template <typename Dtype, Precision P>
-void Loader<Dtype, P>::InitMemoryFromProgram(
+template <typename Device, typename T>
+void Loader<Device, T>::InitMemoryFromProgram(
     const std::shared_ptr<ProgramDesc> &originProgramDesc,
     const std::shared_ptr<Scope> &scope) {
   for (const auto &block : originProgramDesc.get()->Blocks()) {
...

@@ -43,8 +37,6 @@ void Loader<Dtype, P>::InitMemoryFromProgram(
         tensor->Resize(make_ddim(dim));
       } else {
         auto dim = var_desc->Tensor_desc().Dims();
-        // PADDLE_MOBILE_ENFORCE(dim.size() > 0, "dim size is 0");
-        // dim[0] = 1;
         if (dim.size() == 0) {
           auto tensor = var->GetMutable<LoDTensor>();
           framework::DDim dDim = {0};
...

@@ -60,7 +52,7 @@ void Loader<Dtype, P>::InitMemoryFromProgram(
         }
       }
     } else {
-      // TODO(codeWorm): some.
+      // TODO(codeWorm)
     }
   }
 }
...

@@ -68,7 +60,7 @@ void Loader<Dtype, P>::InitMemoryFromProgram(
 #ifdef PADDLE_MOBILE_CL
 template <>
-void Loader<GPU_CL, Precision::FP32>::InitMemoryFromProgram(
+void Loader<GPU_CL, float>::InitMemoryFromProgram(
     const std::shared_ptr<ProgramDesc> &originProgramDesc,
     const std::shared_ptr<Scope> &scope) {
   for (const auto &block : originProgramDesc.get()->Blocks()) {
...

@@ -77,7 +69,6 @@ void Loader<GPU_CL, Precision::FP32>::InitMemoryFromProgram(
       if (var_desc->Type() == VARTYPE_TYPE_LOD_TENSOR) {
         if (var_desc->Persistable()) {
           auto dim = var_desc->Tensor_desc().Dims();
-          // auto tensor = var->GetMutable<LoDTensor>();
           auto cl_image = var->GetMutable<framework::CLImage>();
           cl_image->Resize(make_ddim(dim));
         } else {
...

@@ -88,14 +79,13 @@ void Loader<GPU_CL, Precision::FP32>::InitMemoryFromProgram(
           cl_image->Resize(make_ddim(dim));
         }
       } else {
-        // TODO(codeWorm): some.
+        // TODO(codeWorm)
       }
     }
   }
 }
 template <>
-const Program<GPU_CL, Precision::FP32>
-Loader<GPU_CL, Precision::FP32>::LoadCombinedMemory(
+const Program<GPU_CL, float> Loader<GPU_CL, float>::LoadCombinedMemory(
     size_t read_size, const uint8_t *buf, size_t combined_params_len,
     uint8_t *combined_params_buf, bool optimize, bool quantification) {
   bool can_add_split = false;
...

@@ -113,7 +103,7 @@ Loader<GPU_CL, Precision::FP32>::LoadCombinedMemory(
   auto originProgramDesc = std::make_shared<ProgramDesc>(c_program);

-  Program<GPU_CL, Precision::FP32> program;
+  Program<GPU_CL, float> program;
   program.combined = true;
   program.originProgram = originProgramDesc;
   program.quantification = quantification;
...

@@ -145,16 +135,16 @@
 /**
  * fusion and print someinfos
- * @tparam Dtype
+ * @tparam Device
  * @tparam P
  * @param optimize
  * @param can_add_split
  * @param program
  * @param originProgramDesc
  */
-template <typename Dtype, Precision P>
+template <typename Device, typename T>
 void FusionAndPrintInfos(
-    bool optimize, bool can_add_split, Program<Dtype, P> *program,
+    bool optimize, bool can_add_split, Program<Device, T> *program,
     const std::shared_ptr<ProgramDesc> &originProgramDesc) {
   if (optimize) {
     ProgramOptimize program_optimize;
...

@@ -193,22 +183,22 @@ static size_t ReadBuffer(const char *file_name, uint8_t **out) {
   return cur_len;
 }

-template <typename Dtype, Precision P>
-const Program<Dtype, P> Loader<Dtype, P>::Load(const std::string &dirname,
-                                               bool optimize,
-                                               bool quantification,
-                                               bool can_add_split) {
+template <typename Device, typename T>
+const Program<Device, T> Loader<Device, T>::Load(const std::string &dirname,
+                                                 bool optimize,
+                                                 bool quantification,
+                                                 bool can_add_split) {
   auto program = this->LoadProgram(dirname + "/__model__", optimize,
                                    quantification, can_add_split);
   program.model_path = dirname;
   return program;
 }

-template <typename Dtype, Precision P>
-const Program<Dtype, P> Loader<Dtype, P>::Load(const std::string &model_path,
-                                               const std::string &para_path,
-                                               bool optimize,
-                                               bool quantification) {
+template <typename Device, typename T>
+const Program<Device, T> Loader<Device, T>::Load(const std::string &model_path,
+                                                 const std::string &para_path,
+                                                 bool optimize,
+                                                 bool quantification) {
   auto program = this->LoadProgram(model_path, optimize, quantification);
   program.para_path = para_path;
...

@@ -217,8 +207,8 @@
   return program;
 }

-template <typename Dtype, Precision P>
-const Program<Dtype, P> Loader<Dtype, P>::LoadProgram(
+template <typename Device, typename T>
+const Program<Device, T> Loader<Device, T>::LoadProgram(
     const std::string &model_path, bool optimize, bool quantification,
     bool can_add_split) {
   std::string model_filename = model_path;
...

@@ -237,7 +227,7 @@
   //
   auto originProgramDesc = std::make_shared<ProgramDesc>(c_program);

-  Program<Dtype, P> program;
+  Program<Device, T> program;
   program.originProgram = originProgramDesc;
   program.quantification = quantification;
   program.combined_params_len = 0;
...

@@ -254,8 +244,8 @@
   return program;
 }

-template <typename Dtype, Precision P>
-const Program<Dtype, P> Loader<Dtype, P>::LoadCombinedMemory(
+template <typename Device, typename T>
+const Program<Device, T> Loader<Device, T>::LoadCombinedMemory(
     size_t read_size, const uint8_t *buf, size_t combined_params_len,
     uint8_t *combined_params_buf, bool optimize, bool quantification) {
   bool can_add_split = false;
...

@@ -273,7 +263,7 @@
   auto originProgramDesc = std::make_shared<ProgramDesc>(c_program);

-  Program<Dtype, P> program;
+  Program<Device, T> program;
   program.combined = true;
   program.originProgram = originProgramDesc;
   program.quantification = quantification;
...

@@ -289,13 +279,13 @@
   return program;
 }

-template class Loader<CPU, Precision::FP32>;
-template class Loader<FPGA, Precision::FP32>;
-template class Loader<GPU_MALI, Precision::FP32>;
-template class Loader<GPU_CL, Precision::FP32>;
+template class Loader<CPU, float>;
+template class Loader<FPGA, float>;
+template class Loader<GPU_MALI, float>;
+template class Loader<GPU_CL, float>;

 }  // namespace framework
 }  // namespace paddle_mobile
src/framework/loader.h  (+21 −21)

@@ -22,39 +22,39 @@
 namespace paddle_mobile {
 namespace framework {

-template <typename Dtype = CPU, Precision P = Precision::FP32>
+template <typename Device = CPU, typename T = float>
 class Loader {
  public:
   /*
    * @b load separate format fluid model
-   * @b load a fluid model stored as separate files  (original comment in Chinese)
+   * @b load a fluid model whose files are stored separately  (original comment in Chinese)
    * */
-  const Program<Dtype, P> Load(const std::string &dirname,
-                               bool optimize = false,
-                               bool quantification = false,
-                               bool can_add_split = false);
+  const Program<Device, T> Load(const std::string &dirname,
+                                bool optimize = false,
+                                bool quantification = false,
+                                bool can_add_split = false);

   /*
-   * @b load combine format fluid mode
-   * @b load a fluid model stored in combined format  (original comment in Chinese)
+   * @b load combine format fluid model
+   * @b load a fluid model stored as a single combined file  (original comment in Chinese)
    * */
-  const Program<Dtype, P> Load(const std::string &model_path,
-                               const std::string &para_path,
-                               bool optimize = false,
-                               bool quantification = false);
+  const Program<Device, T> Load(const std::string &model_path,
+                                const std::string &para_path,
+                                bool optimize = false,
+                                bool quantification = false);

-  const Program<Dtype, P> LoadCombinedMemory(
+  const Program<Device, T> LoadCombinedMemory(
       size_t model_len, const uint8_t *model_buf, size_t combined_params_len,
       uint8_t *combined_params_buf, bool optimize = false,
       bool quantification = false);

  private:
-  const Program<Dtype, P> LoadProgram(const std::string &model_path,
-                                      bool optimize = false,
-                                      bool quantification = false,
-                                      bool can_add_split = false);
+  const Program<Device, T> LoadProgram(const std::string &model_path,
+                                       bool optimize = false,
+                                       bool quantification = false,
+                                       bool can_add_split = false);

   void InitMemoryFromProgram(
       const std::shared_ptr<ProgramDesc> &originProgramDesc,
...
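The two Load overloads above cover the two fluid model layouts. A short sketch of how a caller might pick between them; the paths are hypothetical:

    framework::Loader<CPU, float> loader;

    // Model stored as a directory of separate files (reads dirname + "/__model__").
    auto separate_program = loader.Load("./mobilenet", /*optimize=*/true);

    // Model topology and parameters stored as two combined files.
    auto combined_program = loader.Load("./model", "./params", /*optimize=*/true);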
src/framework/lod_tensor.h  (+27 −3)

@@ -16,12 +16,12 @@
 #include <memory>
 #include <string>
 #include <utility>
 #include <vector>
-#include "tensor.h"
-#include "tensor_util.h"
+#include "framework/tensor.h"
+#include "framework/tensor_util.h"

 namespace paddle_mobile {
 namespace framework {
...

@@ -202,5 +202,29 @@
 void DeserializeFromStream(std::istream &is, LoDTensor *tensor);

+#ifdef PADDLE_MOBILE_DEBUG
+inline Print &operator<<(Print &printer, const LoDTensor &tensor) {
+  printer << " dims: " << tensor.dims() << "\n";
+  int stride = tensor.numel() / 20;
+  stride = stride > 0 ? stride : 1;
+#ifndef PADDLE_MOBILE_FPGA
+  for (int i = 0; i < tensor.numel(); i += stride) {
+    if (tensor.type() == typeid(float)) {
+      printer << tensor.data<float>()[i] << " ";
+    } else if (tensor.type() == typeid(int32_t)) {
+      printer << tensor.data<int32_t>()[i] << " ";
+    } else if (tensor.type() == typeid(int64_t)) {
+      printer << tensor.data<int64_t>()[i] << " ";
+    } else if (tensor.type() == typeid(int8_t)) {
+      printer << static_cast<int>(tensor.data<int8_t>()[i]) << " ";
+    }
+  }
+#endif  // PADDLE_MOBILE_FPGA
+  return printer;
+}
+#endif  // PADDLE_MOBILE_DEBUG
+
 }  // namespace framework
 }  // namespace paddle_mobile
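With the new operator<< overload, a LoDTensor can be streamed directly to the project's debug logger, printing its dims plus roughly twenty evenly strided elements. A sketch, assuming DLOG yields a Print stream as it does elsewhere in this codebase and that debug builds define PADDLE_MOBILE_DEBUG:

    #ifdef PADDLE_MOBILE_DEBUG
    framework::LoDTensor t;
    // ... fill t with prediction results ...
    DLOG << t;  // dims followed by a sparse sample of element values
    #endif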
src/framework/program/program.h  (+2 −3)

@@ -14,16 +14,15 @@
 #pragma once

+#include <string>
 #include "common/types.h"
 #include "framework/program/program_desc.h"
 #include "framework/scope.h"
-#include <string>

 namespace paddle_mobile {
 namespace framework {

-template <typename Dtype, Precision P = Precision::FP32>
+template <typename Device, typename T = float>
 class Program {
  public:
   std::shared_ptr<ProgramDesc> originProgram;
...
src/framework/scope.h  (+1 −0)

@@ -26,6 +26,7 @@
 namespace paddle_mobile {
 namespace framework {

 class Scope {
  public:
   Scope() = default;
...
src/framework/tensor.h  (+0 −1)

@@ -226,7 +226,6 @@ inline Print &operator<<(Print &printer, const Tensor &tensor) {
     }
   }
 #endif
   return printer;
 }
...
src/io/api_paddle_mobile.cc  (+22 −22)

@@ -18,17 +18,17 @@
 namespace paddle_mobile {

-template <typename Dtype, Precision P>
-PaddleMobilePredictor<Dtype, P>::PaddleMobilePredictor(
+template <typename Device, typename T>
+PaddleMobilePredictor<Device, T>::PaddleMobilePredictor(
     const PaddleMobileConfig &config) {
   PADDLE_MOBILE_ENFORCE(Init(config) == true,
                         "paddle mobile predictor init failed!");
   config_ = config;
 }

-template <typename Dtype, Precision P>
-bool PaddleMobilePredictor<Dtype, P>::Init(const PaddleMobileConfig &config) {
-  paddle_mobile_.reset(new PaddleMobile<Dtype, P>());
+template <typename Device, typename T>
+bool PaddleMobilePredictor<Device, T>::Init(const PaddleMobileConfig &config) {
+  paddle_mobile_.reset(new PaddleMobile<Device, T>());
 #ifdef PADDLE_MOBILE_CL
   paddle_mobile_->SetCLPath(config.cl_path);
 #endif
...

@@ -52,8 +52,8 @@
   paddle_mobile_->SetThreadNum(config.thread_num);
   return true;
 }

-template <typename Dtype, Precision P>
-bool PaddleMobilePredictor<Dtype, P>::Run(
+template <typename Device, typename T>
+bool PaddleMobilePredictor<Device, T>::Run(
     const std::vector<PaddleTensor> &inputs,
     std::vector<PaddleTensor> *output_data, int batch_size) {
   if (inputs.empty()) {
...

@@ -78,12 +78,12 @@
   framework::Tensor input_tensor;
   input_tensor.Resize(ddim);
   int input_length = framework::product(ddim);
-  typedef typename PrecisionTrait<P>::ptype PType;
-  auto input_ptr = input_tensor.mutable_data<PType>();
+  auto input_ptr = input_tensor.mutable_data<T>();

-  memcpy(input_ptr, static_cast<PType *>(input.data.data()),
-         input_length * sizeof(PType));
-  auto output_tensor = paddle_mobile_->Predict(input_tensor);
+  memcpy(input_ptr, static_cast<T *>(input.data.data()),
+         input_length * sizeof(T));
+  paddle_mobile_->Predict(input_tensor);
+  auto output_tensor = paddle_mobile_->Fetch();

   if (output_data->empty()) {
     LOG(kLOG_ERROR) << "At least one output should be set with tensors' names.";
...

@@ -99,18 +99,18 @@
     output.shape.push_back(static_cast<int>(d));
   }

-  if (output.data.length() < output_length * sizeof(PType)) {
-    output.data.Resize(output_length * sizeof(PType));
+  if (output.data.length() < output_length * sizeof(T)) {
+    output.data.Resize(output_length * sizeof(T));
   }

-  memcpy(output.data.data(), output_tensor->template data<PType>(),
-         output_length * sizeof(PType));
+  memcpy(output.data.data(), output_tensor->template data<T>(),
+         output_length * sizeof(T));

   return true;
 }

-template <typename Dtype, Precision P>
-PaddleMobilePredictor<Dtype, P>::~PaddleMobilePredictor() {
+template <typename Device, typename T>
+PaddleMobilePredictor<Device, T>::~PaddleMobilePredictor() {
   paddle_mobile_->Clear();
 }
...

@@ -122,13 +122,13 @@ CreatePaddlePredictor<PaddleMobileConfig, PaddleEngineKind::kPaddleMobile>(
   std::unique_ptr<PaddlePredictor> x;
   if (config.precision == PaddleMobileConfig::FP32) {
     if (config.device == PaddleMobileConfig::kCPU) {
-      x.reset(new PaddleMobilePredictor<CPU, Precision::FP32>(config));
+      x.reset(new PaddleMobilePredictor<CPU, float>(config));
     } else if (config.device == PaddleMobileConfig::kFPGA) {
-      x.reset(new PaddleMobilePredictor<FPGA, Precision::FP32>(config));
+      x.reset(new PaddleMobilePredictor<FPGA, float>(config));
     } else if (config.device == PaddleMobileConfig::kGPU_MALI) {
-      x.reset(new PaddleMobilePredictor<GPU_MALI, Precision::FP32>(config));
+      x.reset(new PaddleMobilePredictor<GPU_MALI, float>(config));
     } else if (config.device == PaddleMobileConfig::kGPU_CL) {
-      x.reset(new PaddleMobilePredictor<GPU_CL, Precision::FP32>(config));
+      x.reset(new PaddleMobilePredictor<GPU_CL, float>(config));
     } else {
       LOG(kLOG_ERROR) << "unsupport device type!";
       return nullptr;
...
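The backend dispatch above is driven by the high-level config. A hedged sketch of selecting a backend through this wrapper; only fields visible in this diff are set, and the OpenCL kernel path is an illustrative value:

    PaddleMobileConfig config;
    config.precision = PaddleMobileConfig::FP32;
    config.device = PaddleMobileConfig::kCPU;  // or kFPGA / kGPU_MALI / kGPU_CL
    config.thread_num = 4;
    #ifdef PADDLE_MOBILE_CL
    config.cl_path = "/data/local/tmp/cl_kernel";  // illustrative path
    #endif

    auto predictor =
        CreatePaddlePredictor<PaddleMobileConfig,
                              PaddleEngineKind::kPaddleMobile>(config);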
src/io/api_paddle_mobile.h  (+2 −2)

@@ -29,7 +29,7 @@
 namespace paddle_mobile {

-template <typename Dtype = CPU, Precision P = Precision::FP32>
+template <typename Device = CPU, typename T = float>
 class PaddleMobilePredictor : public PaddlePredictor {
  public:
   PaddleMobilePredictor() = delete;
...

@@ -43,7 +43,7 @@ class PaddleMobilePredictor : public PaddlePredictor {
   ~PaddleMobilePredictor() override;

  private:
-  std::unique_ptr<PaddleMobile<Dtype, P>> paddle_mobile_;
+  std::unique_ptr<PaddleMobile<Device, T>> paddle_mobile_;
   bool Init(const PaddleMobileConfig &config);
   PaddleMobileConfig config_;
...
src/io/ios_io/PaddleMobileCPU.mm  (+4 −3)

@@ -48,7 +48,7 @@
 @interface PaddleMobileCPU () {
-  paddle_mobile::PaddleMobile<paddle_mobile::CPU,
-                              paddle_mobile::Precision::FP32> *pam_;
+  paddle_mobile::PaddleMobile<paddle_mobile::CPU, float> *pam_;
   BOOL loaded_;
 }
 @end
...

@@ -59,7 +59,7 @@
 - (instancetype)init {
   if (self = [super init]) {
-    pam_ = new paddle_mobile::PaddleMobile<paddle_mobile::CPU,
-                                           paddle_mobile::Precision::FP32>();
+    pam_ = new paddle_mobile::PaddleMobile<paddle_mobile::CPU, float>();
   }
   return self;
 }
...

@@ -220,7 +220,8 @@
   memcpy(input_ptr, input, numel * sizeof(float));
-  std::shared_ptr<paddle_mobile::framework::Tensor> output =
-      pam_->Predict(input_tensor);
+  pam_->Predict(input_tensor);
+  std::shared_ptr<paddle_mobile::framework::Tensor> output = pam_->Fetch();
   float *output_pointer = new float[output->numel()];
...
src/io/jni/paddle_mobile_jni.cpp
Browse file @ 9729edac
...
@@ -16,21 +16,23 @@ limitations under the License. */
 #include "paddle_mobile_jni.h"
 #include <cmath>
 #include <string>
 #include <vector>
 #include "common/log.h"
 #include "framework/tensor.h"
 #include "io/paddle_mobile.h"
 #ifdef ENABLE_EXCEPTION
 #include "common/enforce.h"
 #endif

 #ifdef __cplusplus
 extern "C" {
 #endif
 namespace paddle_mobile {
 namespace jni {

 using framework::DDim;
 using framework::Program;
 using framework::Tensor;
...
@@ -200,7 +202,8 @@ JNIEXPORT jfloatArray JNICALL Java_com_baidu_paddle_PML_predictImage(
   for (int i = 0; i < length; i++) {
     input_ptr[i] = dataPointer[i];
   }
-  auto output = getPaddleMobileInstance()->Predict(input);
+  getPaddleMobileInstance()->Predict(input);
+  auto output = getPaddleMobileInstance()->Fetch();
   count = output->numel();
   result = env->NewFloatArray(count);
   env->SetFloatArrayRegion(result, 0, count, output->data<float>());
...
@@ -233,7 +236,8 @@ JNIEXPORT jfloatArray JNICALL Java_com_baidu_paddle_PML_predictImage(
   for (int i = 0; i < length; i++) {
     input_ptr[i] = dataPointer[i];
   }
-  auto output = getPaddleMobileInstance()->Predict(input);
+  getPaddleMobileInstance()->Predict(input);
+  auto output = getPaddleMobileInstance()->Fetch();
   count = output->numel();
   result = env->NewFloatArray(count);
   env->SetFloatArrayRegion(result, 0, count, output->data<float>());
...
@@ -328,7 +332,8 @@ JNIEXPORT jfloatArray JNICALL Java_com_baidu_paddle_PML_predictYuv(
   for (int i = 0; i < length; i++) {
     input_ptr[i] = matrix[i];
   }
-  auto output = getPaddleMobileInstance()->Predict(input);
+  getPaddleMobileInstance()->Predict(input);
+  auto output = getPaddleMobileInstance()->Fetch();
   count = output->numel();
   result = env->NewFloatArray(count);
   env->SetFloatArrayRegion(result, 0, count, output->data<float>());
...
@@ -363,7 +368,8 @@ JNIEXPORT jfloatArray JNICALL Java_com_baidu_paddle_PML_predictYuv(
   for (int i = 0; i < length; i++) {
     input_ptr[i] = matrix[i];
   }
-  auto output = getPaddleMobileInstance()->Predict(input);
+  getPaddleMobileInstance()->Predict(input);
+  auto output = getPaddleMobileInstance()->Fetch();
   count = output->numel();
   result = env->NewFloatArray(count);
   env->SetFloatArrayRegion(result, 0, count, output->data<float>());
...
@@ -399,7 +405,8 @@ Java_com_baidu_paddle_PML_predictLod(JNIEnv *env, jclass thiz, jlongArray buf) {
   auto *pdata = words.mutable_data<int64_t>();
   size_t n = words.numel() * sizeof(int64_t);
   memcpy(pdata, ids.data(), n);
-  auto vec_result = paddle_mobile.PredictLod(words);
+  paddle_mobile.Predict(words);
+  auto vec_result = paddle_mobile.Fetch();
   int count = vec_result->numel();
   jlongArray result = NULL;
   ANDROIDLOGE("predict nlp size %d", count);
...
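Every caller touched by this commit follows the same split: `Predict()` now only runs the network, and the result is retrieved separately through `Fetch()`. A standalone sketch of the new pattern, not taken from the diff (the model directory, input shape and tensor setup are assumptions; only the Predict/Fetch calls mirror the change):

#include <algorithm>
#include <iostream>
#include "io/paddle_mobile.h"

int main() {
  paddle_mobile::PaddleMobile<paddle_mobile::CPU, float> paddle_mobile;
  if (paddle_mobile.Load("./mobilenet", true)) {
    // Fill a feed tensor with zeros; the shape is a placeholder.
    paddle_mobile::framework::Tensor input;
    float *in_ptr = input.mutable_data<float>(
        paddle_mobile::framework::make_ddim({1, 3, 224, 224}));
    std::fill(in_ptr, in_ptr + input.numel(), 0.f);

    paddle_mobile.Predict(input);         // runs the network only
    auto output = paddle_mobile.Fetch();  // output retrieved separately
    std::cout << "output numel: " << output->numel() << std::endl;
  }
  return 0;
}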
src/io/paddle_mobile.cpp
Browse file @ 9729edac
...
@@ -13,81 +13,81 @@ See the License for the specific language governing permissions and
 limitations under the License. */

 #include "io/paddle_mobile.h"
+#include <utility>
+#include "common/common.h"
 #ifdef PADDLE_MOBILE_CL
 #include <CL/cl.h>
 #include "framework/cl/cl_tensor.h"
 #endif
-#include "common/common.h"
 #include "operators/math/gemm.h"

 namespace paddle_mobile {

-template <typename Dtype, Precision P>
-void PaddleMobile<Dtype, P>::SetThreadNum(int num) {
+template <typename Device, typename T>
+void PaddleMobile<Device, T>::SetThreadNum(int num) {
 #ifdef _OPENMP
   omp_set_num_threads(num);
 #endif
 }

-template <typename Dtype, Precision P>
-bool PaddleMobile<Dtype, P>::Load(const std::string &dirname, bool optimize,
-                                  bool quantification, int batch_size,
-                                  bool loddable) {
+template <typename Device, typename T>
+PMStatus PaddleMobile<Device, T>::Load(const std::string &dirname,
+                                       bool optimize, bool quantification,
+                                       int batch_size, bool loddable) {
   if (loader_.get() == nullptr) {
-    loader_ = std::make_shared<framework::Loader<Dtype, P>>();
+    loader_ = std::make_shared<framework::Loader<Device, T>>();
   } else {
     LOG(kLOG_INFO) << "loader inited";
   }
   if (executor_.get() == nullptr) {
-    executor_ = std::make_shared<framework::Executor<Dtype, P>>(
+    executor_ = std::make_shared<framework::Executor<Device, T>>(
         loader_->Load(dirname, optimize, quantification), batch_size, optimize,
         loddable);
   } else {
     LOG(kLOG_INFO) << "executor inited";
   }
-  return true;
+  return PMSuccess;
 }

-template <typename Dtype, Precision P>
-bool PaddleMobile<Dtype, P>::Load(const std::string &model_path,
-                                  const std::string &para_path, bool optimize,
-                                  bool quantification, int batch_size,
-                                  bool loddable) {
+template <typename Device, typename T>
+PMStatus PaddleMobile<Device, T>::Load(const std::string &model_path,
+                                       const std::string &para_path,
+                                       bool optimize, bool quantification,
+                                       int batch_size, bool loddable) {
   if (loader_.get() == nullptr) {
-    loader_ = std::make_shared<framework::Loader<Dtype, P>>();
+    loader_ = std::make_shared<framework::Loader<Device, T>>();
   } else {
     LOG(kLOG_INFO) << "loader inited";
   }
   if (executor_.get() == nullptr) {
-    executor_ = std::make_shared<framework::Executor<Dtype, P>>(
+    executor_ = std::make_shared<framework::Executor<Device, T>>(
         loader_->Load(model_path, para_path, optimize, quantification),
         batch_size, optimize, loddable);
   } else {
     LOG(kLOG_INFO) << "executor inited";
   }
-  return true;
+  return PMSuccess;
 }

-template <typename Dtype, Precision P>
-bool PaddleMobile<Dtype, P>::LoadCombinedMemory(size_t model_len,
-                                                const uint8_t *model_buf,
-                                                size_t combined_params_len,
-                                                uint8_t *combined_params_buf) {
+template <typename Device, typename T>
+bool PaddleMobile<Device, T>::LoadCombinedMemory(size_t model_len,
+                                                 const uint8_t *model_buf,
+                                                 size_t combined_params_len,
+                                                 uint8_t *combined_params_buf) {
   int batch_size = 1;
   bool optimise = true;
   bool quantification = false;
   if (loader_.get() == nullptr) {
-    loader_ = std::make_shared<framework::Loader<Dtype, P>>();
+    loader_ = std::make_shared<framework::Loader<Device, T>>();
   } else {
     LOG(kLOG_INFO) << "loader inited";
   }
   if (executor_.get() == nullptr) {
-    executor_ = std::make_shared<framework::Executor<Dtype, P>>(
+    executor_ = std::make_shared<framework::Executor<Device, T>>(
         loader_->LoadCombinedMemory(model_len, model_buf, combined_params_len,
                                     combined_params_buf, optimise,
                                     quantification),
...
@@ -96,38 +96,76 @@ bool PaddleMobile<Dtype, P>::LoadCombinedMemory(size_t model_len,
     LOG(kLOG_INFO) << "executor inited";
   }
-  return true;
+  return PMSuccess;
 }

+template <typename Device, typename T>
+PMStatus PaddleMobile<Device, T>::Predict(const framework::Tensor &input) {
+  std::vector<std::pair<std::string, framework::Tensor>> inputs;
+  inputs.push_back(std::make_pair("feed", input));
+  return this->Predict(inputs);
+}
+
-template <typename Dtype, Precision P>
-std::shared_ptr<framework::Tensor> PaddleMobile<Dtype, P>::Predict(
-    const framework::Tensor &t) {
-  return executor_->Predict(t);
+template <typename Device, typename T>
+PMStatus PaddleMobile<Device, T>::Predict(const framework::LoDTensor &input) {
+  std::vector<std::pair<std::string, framework::LoDTensor>> inputs;
+  inputs.push_back(std::make_pair("feed", input));
+  return this->Predict(inputs);
 }

+template <typename Device, typename T>
+PMStatus PaddleMobile<Device, T>::Predict(
+    const std::vector<std::pair<std::string, framework::Tensor>> &inputs) {
+  return executor_->Predict(inputs);
+}
+
-template <typename Dtype, Precision P>
-std::shared_ptr<framework::Tensor> PaddleMobile<Dtype, P>::PredictLod(
-    const framework::LoDTensor &t) {
-  return executor_->PredictLod(t);
+template <typename Device, typename T>
+PMStatus PaddleMobile<Device, T>::Predict(
    const std::vector<std::pair<std::string, framework::LoDTensor>> &inputs) {
+  return executor_->Predict(inputs);
 }

-template <typename Dtype, Precision P>
-std::vector<typename PaddleMobile<Dtype, P>::Ptype> PaddleMobile<Dtype, P>::Predict(
-    const std::vector<Ptype> &input, const std::vector<int64_t> &dims) {
+template <typename Device, typename T>
+std::vector<T> PaddleMobile<Device, T>::Predict(
+    const std::vector<T> &input, const std::vector<int64_t> &dims) {
   return executor_->Predict(input, dims);
 }

-template <typename Dtype, Precision P>
-void PaddleMobile<Dtype, P>::Clear() {
+template <typename Device, typename T>
+PMStatus PaddleMobile<Device, T>::Predict() {
+  return executor_->Predict();
+}
+
+template <typename Device, typename T>
+void PaddleMobile<Device, T>::Feed(const framework::Tensor &input,
+                                   const std::string &var_name) {
+  executor_->SetInput(input, var_name);
+}
+
+template <typename Device, typename T>
+void PaddleMobile<Device, T>::Feed(const framework::LoDTensor &input,
+                                   const std::string &var_name) {
+  executor_->SetInput(input, var_name);
+}
+
+typedef std::shared_ptr<framework::LoDTensor> LoDTensorPtr;
+template <typename Device, typename T>
+LoDTensorPtr PaddleMobile<Device, T>::Fetch(const std::string &var_name) {
+  return executor_->GetOutput(var_name);
+}
+
+template <typename Device, typename T>
+void PaddleMobile<Device, T>::Clear() {
   executor_ = nullptr;
   loader_ = nullptr;
 }

-template <typename Dtype, Precision P>
-double PaddleMobile<Dtype, P>::GetPredictTime() {}
+template <typename Device, typename T>
+double PaddleMobile<Device, T>::GetPredictTime() {}

 #ifdef PADDLE_MOBILE_CPU
 template <>
-double PaddleMobile<CPU, Precision::FP32>::GetPredictTime() {
+double PaddleMobile<CPU, float>::GetPredictTime() {
   int m = 32;
   int n = 224 * 224;
   int k = 27;
...
@@ -148,7 +186,8 @@ double PaddleMobile<CPU, Precision::FP32>::GetPredictTime() {
   for (int i = 0; i < k * n; ++i) {
     b[i] = t1 + rand() % t2;  // NOLINT
   }
-  paddle_mobile::operators::math::Gemm gemm;
+  operators::math::Gemm gemm;
   auto time1 = paddle_mobile::time();
   gemm.Sgemm(m, n, k, static_cast<float>(1), a, lda, b, ldb,
              static_cast<float>(0), c, ldc, false,
...
@@ -162,57 +201,51 @@ double PaddleMobile<CPU, Precision::FP32>::GetPredictTime() {
 }
 #endif

-template <typename Dtype, Precision P>
-PaddleMobile<Dtype, P>::~PaddleMobile() {
-  executor_ = nullptr;
-  loader_ = nullptr;
-}
-
 #ifdef PADDLE_MOBILE_FPGA
-template <typename Dtype, Precision P>
-void PaddleMobile<Dtype, P>::InjectVariable(const framework::Tensor &t,
-                                            std::string var_name) {
+template <typename Device, T P>
+void PaddleMobile<Device, P>::InjectVariable(const framework::Tensor &t,
+                                             std::string var_name) {
   executor_->InjectVariable(t, var_name);
 }

-template <typename Dtype, Precision P>
-void PaddleMobile<Dtype, P>::FeedData(const framework::Tensor &t) {
+template <typename Device, T P>
+void PaddleMobile<Device, P>::FeedData(const framework::Tensor &t) {
   executor_->FeedData(t);
 }

-template <typename Dtype, Precision P>
-std::shared_ptr<framework::Tensor> PaddleMobile<Dtype, P>::FetchResult(int id) {
+template <typename Device, T P>
+std::shared_ptr<framework::Tensor> PaddleMobile<Device, P>::FetchResult(int id) {
   return executor_->FetchResult(id);
 }

-template <typename Dtype, Precision P>
-void PaddleMobile<Dtype, P>::Predict_From_To(int start, int end) {
+template <typename Device, T P>
+void PaddleMobile<Device, P>::Predict_From_To(int start, int end) {
   executor_->Predict_From_To(start, end);
 }

-template <typename Dtype, Precision P>
-void PaddleMobile<Dtype, P>::Predict_From(int start) {
+template <typename Device, T P>
+void PaddleMobile<Device, P>::Predict_From(int start) {
   executor_->Predict_From(start);
 }

-template <typename Dtype, Precision P>
-void PaddleMobile<Dtype, P>::Predict_To(int end) {
+template <typename Device, T P>
+void PaddleMobile<Device, P>::Predict_To(int end) {
   executor_->Predict_To(end);
 }
 #endif

 #ifdef PADDLE_MOBILE_CL
 static std::mutex lc;
-template <typename Dtype, Precision P>
-void PaddleMobile<Dtype, P>::SetCLPath(std::string path) {
+template <typename Device, T P>
+void PaddleMobile<Device, P>::SetCLPath(std::string path) {
   std::lock_guard<std::mutex> lock(lc);
   if (framework::CLEngine::Instance()->GetCLPath() == "") {
     framework::CLEngine::Instance()->setClPath(path);
   }
 }
 template <>
-double PaddleMobile<GPU_CL, Precision::FP32>::GetPredictTime() {
+double PaddleMobile<GPU_CL, T::FP32>::GetPredictTime() {
   cl_int status;
   cl_uint nPlatform;
   clGetPlatformIDs(0, NULL, &nPlatform);
...
@@ -410,8 +443,8 @@ double PaddleMobile<GPU_CL, Precision::FP32>::GetPredictTime() {
     return -1;
   }
 }
-template <typename Dtype, Precision P>
-int PaddleMobile<Dtype, P>::readText(
+template <typename Device, T P>
+int PaddleMobile<Device, P>::readText(
     const char *kernelPath,
     char **pcode) {  // read a text file into pcode, return the string length
   FILE *fp;
...
@@ -440,13 +473,11 @@ int PaddleMobile<Dtype, P>::readText(
   fclose(fp);
   return size + 1;
 }
 #endif

-template class PaddleMobile<CPU, Precision::FP32>;
-template class PaddleMobile<FPGA, Precision::FP32>;
-template class PaddleMobile<GPU_MALI, Precision::FP32>;
-template class PaddleMobile<GPU_CL, Precision::FP32>;
+template class PaddleMobile<CPU, float>;
+template class PaddleMobile<FPGA, float>;
+template class PaddleMobile<GPU_MALI, float>;
+template class PaddleMobile<GPU_CL, float>;

 }  // namespace paddle_mobile
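The new `Predict` overloads above are what enable feeding several inputs in one call: each entry pairs a feed variable name with its tensor, and the default output is still fetched under the name "fetch". A hedged sketch of how a caller might use the multi-input overload (the variable names "image" and "prior", the helper function and its setup are made up for illustration; only the API shape follows the diff):

#include <iostream>
#include <string>
#include <utility>
#include <vector>
#include "io/paddle_mobile.h"

// Assumes `image` and `prior` are tensors already filled with data and that
// the model declares feed variables with these (hypothetical) names.
void RunWithTwoInputs(paddle_mobile::PaddleMobile<paddle_mobile::CPU, float> &engine,
                      const paddle_mobile::framework::Tensor &image,
                      const paddle_mobile::framework::Tensor &prior) {
  std::vector<std::pair<std::string, paddle_mobile::framework::Tensor>> inputs;
  inputs.emplace_back("image", image);
  inputs.emplace_back("prior", prior);
  engine.Predict(inputs);     // feeds every named input, then runs the graph
  auto out = engine.Fetch();  // default output, i.e. Fetch("fetch")
  std::cout << "fetched " << out->numel() << " values" << std::endl;
}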
src/io/paddle_mobile.h
Browse file @ 9729edac
...
@@ -16,6 +16,7 @@ limitations under the License. */
 #include <memory>
 #include <string>
 #include <utility>
+#include <vector>

 #ifdef _OPENMP
 #include <omp.h>
...
@@ -32,43 +33,52 @@ limitations under the License. */
 namespace paddle_mobile {

-template <typename Dtype = CPU, Precision P = Precision::FP32>
+template <typename Device, typename T = float>
 class PaddleMobile {
-  typedef typename PrecisionTrait<P>::ptype Ptype;
-
  public:
   PaddleMobile() {
 #ifndef PADDLE_MOBILE_CL
-    bool is_gpu = std::is_same<DeviceType<kGPU_CL>, Dtype>::value;
-    PADDLE_MOBILE_ENFORCE(!is_gpu, "Not Enable GPU in CmakeList but run gpu codes ");
+    bool is_gpu = std::is_same<DeviceType<kGPU_CL>, Device>::value;
+    PADDLE_MOBILE_ENFORCE(!is_gpu, "Please recompile with GPU_CL is on");
 #endif
   }
-  bool Load(const std::string &dirname, bool optimize = false,
-            bool quantification = false, int batch_size = 1,
-            bool loddable = false);
+  ~PaddleMobile() {}
+
+  PMStatus Load(const std::string &dirname, const bool optimize = false,
+                const bool quantification = false, const int batch_size = 1,
+                const bool lod = false);
+  PMStatus Load(const std::string &model_path, const std::string &para_path,
+                const bool optimize = false, const bool quantification = false,
+                const int batch_size = 1, const bool lod = false);
+
+  PMStatus Predict(const framework::Tensor &input);
+  PMStatus Predict(const framework::LoDTensor &input);
-  bool Load(const std::string &model_path, const std::string &para_path,
-            bool optimize = false, bool quantification = false,
-            int batch_size = 1, bool loddable = false);
+  PMStatus Predict(
+      const std::vector<std::pair<std::string, framework::Tensor>> &inputs);
+  PMStatus Predict(
+      const std::vector<std::pair<std::string, framework::LoDTensor>> &inputs);

-  std::shared_ptr<framework::Tensor> Predict(const framework::Tensor &t);
+  std::vector<T> Predict(const std::vector<T> &input,
+                         const std::vector<int64_t> &dims);
+  PMStatus Predict();

-  std::shared_ptr<framework::Tensor> PredictLod(const framework::LoDTensor &t);
+  void Feed(const framework::LoDTensor &input, const std::string &var_name);
+  void Feed(const framework::Tensor &input, const std::string &var_name);

-  std::vector<Ptype> Predict(const std::vector<Ptype> &input,
-                             const std::vector<int64_t> &dims);
+  typedef std::shared_ptr<framework::LoDTensor> LoDTensorPtr;
+  LoDTensorPtr Fetch(const std::string &var_name);
+  LoDTensorPtr Fetch() { return Fetch("fetch"); }

   bool LoadCombinedMemory(size_t model_len, const uint8_t *model_buf,
                           size_t combined_params_len,
                           uint8_t *combined_params_buf);

-  void SetThreadNum(int num);
+  void SetThreadNum(int count);
   void Clear();
   double GetPredictTime();
-  ~PaddleMobile();

 #ifdef PADDLE_MOBILE_FPGA
   void InjectVariable(const framework::Tensor &t, std::string var_name);
   void FeedData(const framework::Tensor &t);
...
@@ -79,15 +89,15 @@ class PaddleMobile {
 #endif

 #ifdef PADDLE_MOBILE_CL
- public:
+ public:  // NOLINT
   void SetCLPath(std::string cl_path);
   int readText(const char *kernelPath,
                char **pcode);  // read a text file into pcode, return the string length
 #endif

  private:
-  std::shared_ptr<framework::Loader<Dtype, P>> loader_;
-  std::shared_ptr<framework::Executor<Dtype, P>> executor_;
+  std::shared_ptr<framework::Loader<Device, T>> loader_;
+  std::shared_ptr<framework::Executor<Device, T>> executor_;
 };

 }  // namespace paddle_mobile
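Beyond the paired-input overloads, the header now also exposes a name-based workflow: `Feed()` stages each input, `Predict()` runs the graph once, and `Fetch(name)` pulls out any output. A minimal sketch of that flow (the variable names "words", "mentions", "crf_decode" and "emission" are placeholders, as is the helper function; a real model defines its own feed/fetch variable names):

#include <iostream>
#include "io/paddle_mobile.h"

void MultiInMultiOut(paddle_mobile::PaddleMobile<paddle_mobile::CPU, float> &engine,
                     const paddle_mobile::framework::LoDTensor &words,
                     const paddle_mobile::framework::LoDTensor &mentions) {
  engine.Feed(words, "words");        // stage each named input
  engine.Feed(mentions, "mentions");
  engine.Predict();                   // run once with everything that was fed

  auto crf_decode = engine.Fetch("crf_decode");  // fetch outputs by name
  auto emission = engine.Fetch("emission");
  std::cout << crf_decode->numel() << " / " << emission->numel() << std::endl;
}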
src/io/paddle_test_inference_api.cpp
Browse file @ 9729edac
...
@@ -14,10 +14,12 @@ limitations under the License. */
 #include "io/paddle_test_inference_api.h"
 #include "io/paddle_mobile.h"
 namespace paddle_mobile {
-template <typename Dtype, Precision P>
-double PaddleTester<Dtype, P>::CaculatePredictTime(std::string *cl_path) {
-  PaddleMobile<Dtype, P> paddle_mobile;
+template <typename Device, typename T>
+double PaddleTester<Device, T>::CaculatePredictTime(std::string *cl_path) {
+  PaddleMobile<Device, T> paddle_mobile;
 #ifdef PADDLE_MOBILE_CL
   if (cl_path) {
     paddle_mobile.SetCLPath(*cl_path);
...
@@ -26,10 +28,10 @@ double PaddleTester<Dtype, P>::CaculatePredictTime(std::string *cl_path) {
 #endif
   return paddle_mobile.GetPredictTime();
 }
-template class PaddleTester<CPU, Precision::FP32>;
-template class PaddleTester<FPGA, Precision::FP32>;
-template class PaddleTester<GPU_MALI, Precision::FP32>;
+template class PaddleTester<CPU, float>;
+template class PaddleTester<FPGA, float>;
+template class PaddleTester<GPU_MALI, float>;

-template class PaddleTester<GPU_CL, Precision::FP32>;
+template class PaddleTester<GPU_CL, float>;

 }  // namespace paddle_mobile
src/io/paddle_test_inference_api.h
Browse file @ 9729edac
...
@@ -20,10 +20,13 @@ limitations under the License. */
 */
 #pragma once
 #include "common/types.h"
 #include "string"
 namespace paddle_mobile {
-template <typename Dtype, Precision P = Precision::FP32>
+template <typename Device, typename T = float>
 class PaddleTester {
  public:
   double CaculatePredictTime(std::string *cl_path = nullptr);
...
test/CMakeLists.txt
Browse file @ 9729edac
...
@@ -375,5 +375,8 @@ if (NOT FOUND_MATCH)
     # gen test
     ADD_EXECUTABLE(test-super net/test_super.cpp test_helper.h test_include.h)
     target_link_libraries(test-super paddle-mobile)

     #add_library(test-lib-size SHARED common/test_lib_size.h common/test_lib_size.cpp)
+
+    # gen test
+    ADD_EXECUTABLE(test-ocr net/test_ocr.cpp test_helper.h test_include.h)
+    target_link_libraries(test-ocr paddle-mobile)
 endif ()
test/executor_for_test.h
Browse file @ 9729edac
...
@@ -39,6 +39,7 @@ using paddle_mobile::framework::Tensor;
 using paddle_mobile::framework::Variable;
 using std::string;
+using std::vector;

 template <typename DeviceType, typename OpType>
 class Executor4Test : public Executor<DeviceType> {
  public:
...
@@ -48,20 +49,19 @@ class Executor4Test : public Executor<DeviceType> {
     this->use_optimize_ = use_optimize;
     this->program_ = p;
     if (this->use_optimize_) {
-      this->to_predict_program_ = this->program_.optimizeProgram;
+      this->program_desc_ = this->program_.optimizeProgram;
     } else {
-      this->to_predict_program_ = this->program_.originProgram;
+      this->program_desc_ = this->program_.originProgram;
     }
     if (this->program_.originProgram == nullptr) {
-      LOG(paddle_mobile::LogLevel::kLOG_ERROR) << "to_predict_program_ == nullptr";
+      LOG(paddle_mobile::LogLevel::kLOG_ERROR) << "program_desc_ == nullptr";
     }
     const std::vector<std::shared_ptr<BlockDesc>> blocks =
-        this->to_predict_program_->Blocks();
-    for (std::shared_ptr<BlockDesc> block_desc : blocks) {
-      std::vector<std::shared_ptr<OpDesc>> ops = block_desc->Ops();
+        this->program_desc_->Blocks();
+    for (int block_id = 0; block_id < blocks.size(); ++block_id) {
+      std::vector<std::shared_ptr<OpDesc>> ops = blocks[block_id]->Ops();
       for (int i = 0; i < ops.size(); ++i) {
         auto op = ops[i];
         if (op->Type() == op_type) {
...
@@ -73,18 +73,16 @@ class Executor4Test : public Executor<DeviceType> {
               paddle_mobile::framework::OpRegistry<DeviceType>::CreateOp(
                   op->Type(), op->GetInputs(), op->GetOutputs(),
                   op->GetAttrMap(), this->program_.scope);
-          this->ops_of_block_[*block_desc.get()].push_back(op_ptr);
+          this->ops_of_block_[block_id].push_back(op_ptr);
           break;
         }
       }
     }
     this->InitMemory();
-    std::shared_ptr<paddle_mobile::framework::BlockDesc> to_predict_block =
-        this->to_predict_program_->Block(0);
-    auto &ops = this->ops_of_block_[*to_predict_block.get()];
-    for (const auto &op : ops) {
-      op->Init();
+    for (const auto &ops : this->ops_of_block_) {
+      for (const auto &op : ops) {
+        op->Init();
+      }
     }
   }
...
@@ -117,12 +115,10 @@ class Executor4Test : public Executor<DeviceType> {
       output_tensor_sptrs[i].reset(output_tensors[i]);
     }

-    std::shared_ptr<paddle_mobile::framework::BlockDesc> to_predict_block =
-        this->to_predict_program_->Block(0);
-    for (int j = 0; j < this->ops_of_block_[*to_predict_block.get()].size(); ++j) {
-      auto op = this->ops_of_block_[*to_predict_block.get()][j];
-      op->Run();
+    for (auto &ops : this->ops_of_block_) {
+      for (auto &op : ops) {
+        op->Run();
+      }
     }

     return output_tensor_sptrs;
...
@@ -139,14 +135,11 @@ class Executor4Test : public Executor<DeviceType> {
     auto *output_tensor = con_output->GetMutable<LoDTensor>();
     output_tensor->mutable_data<float>(dDim);

-    std::shared_ptr<paddle_mobile::framework::BlockDesc> to_predict_block =
-        this->to_predict_program_->Block(0);
-    for (int j = 0; j < this->ops_of_block_[*to_predict_block.get()].size(); ++j) {
-      auto op = this->ops_of_block_[*to_predict_block.get()][j];
-      op->Run();
+    for (auto &ops : this->ops_of_block_) {
+      for (auto &op : ops) {
+        op->Run();
+      }
     }

     return std::make_shared<paddle_mobile::framework::Tensor>(
         paddle_mobile::framework::Tensor(*output_tensor));
   }
...
test/net/test_benchmark.cpp
Browse file @ 9729edac
...
@@ -52,15 +52,16 @@ int main(int argc, char* argv[]) {
   SetupTensor<float>(&input, in_shape, 0.f, 255.f);
   // warmup
   for (int i = 0; i < 10; ++i) {
-    output = paddle_mobile.Predict(input);
+    paddle_mobile.Predict(input);
   }
   auto time3 = time();
   for (int i = 0; i < 10; ++i) {
-    output = paddle_mobile.Predict(input);
+    paddle_mobile.Predict(input);
   }
   auto time4 = time();
   std::cout << "predict cost :" << time_diff(time3, time4) / 10 << "ms\n";

   std::ostringstream os("output tensor size: ");
+  output = paddle_mobile.Fetch();
   os << output->numel() << "\n" << output->data<float>()[0];
   for (int i = 1; i < output->numel(); ++i) {
     os << ", " << output->data<float>()[i];
...
test/net/test_eng.cpp
Browse file @ 9729edac
...
@@ -36,11 +36,11 @@ int main() {
                    input_tensor.data<float>() + input_tensor.numel());
   // warm up ten times
   for (int i = 0; i < 1; ++i) {
-    paddle_mobile.PredictLod(input_tensor);
+    paddle_mobile.Predict(input_tensor);
   }
   auto time3 = time();
   for (int i = 0; i < 1; ++i) {
-    paddle_mobile.PredictLod(input_tensor);
+    paddle_mobile.Predict(input_tensor);
   }
   auto time4 = time();
   std::cout << "predict cost :" << time_diff(time3, time4) << "ms"
...
test/net/test_googlenet.cpp
Browse file @ 9729edac
...
@@ -41,12 +41,12 @@ int main(int argc, char* argv[]) {
 #endif
   paddle_mobile.SetThreadNum(thread_num);
   auto time1 = time();
-  if (paddle_mobile.Load(g_googlenet, optimize)) {
-    std::vector<float> output;
+  if (paddle_mobile.Load(g_googlenet, optimize, false, 1, true)) {
     auto time2 = paddle_mobile::time();
     std::cout << "load cost :" << paddle_mobile::time_diff(time1, time2) << "ms"
               << std::endl;
     std::vector<float> input;
+    std::vector<float> output;
     std::vector<int64_t> dims{1, 3, 224, 224};
     if (feed_shape) {
       sscanf(feed_shape, "%d,%d,%d", &dims[1], &dims[2], &dims[3]);
...
test/net/test_nlp.cpp
Browse file @ 9729edac
...
@@ -48,8 +48,8 @@ int main() {
   DLOG << "words lod 22: " << words.lod();
   auto time3 = time();
   for (int i = 0; i < 1; ++i) {
-    auto vec_result = paddle_mobile.PredictLod(words);
-    DLOG << *vec_result;
+    paddle_mobile.Predict(words);
+    DLOG << *paddle_mobile.Fetch();
   }
   auto time4 = time();
   std::cout << "predict cost :" << time_diff(time3, time4) / 1 << "ms"
...
@@ -84,8 +84,8 @@ int main() {
   DLOG << "words lod 22: " << words.lod();
   auto time3 = time();
   for (int i = 0; i < 1; ++i) {
-    auto vec_result = paddle_mobile.PredictLod(words);
-    DLOG << *vec_result;
+    paddle_mobile.Predict(words);
+    DLOG << *paddle_mobile.Fetch();
   }
   auto time4 = time();
   std::cout << "predict cost :" << time_diff(time3, time4) / 1 << "ms"
...
test/net/test_ocr.cpp
0 → 100644
Browse file @ 9729edac

/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#include <fstream>
#include <iostream>
#include "../test_helper.h"
#include "../test_include.h"

void load_images(const char *image_dir, const char *images_list,
                 std::vector<std::string> *image_names,
                 std::vector<std::pair<int, int>> *image_shapes) {
  int height, width;
  std::string filename;
  std::ifstream if_list(images_list, std::ios::in);
  while (!if_list.eof()) {
    if_list >> height >> width >> filename;
    image_shapes->push_back(std::make_pair(height, width));
    image_names->push_back(filename);
  }
}

int main(int argc, char **argv) {
  if (argc < 4) {
    std::cerr << "Usage: ./test_ocr model_dir image_dir images_list." << std::endl;
    return 1;
  }
  char *model_dir = argv[1];
  char *image_dir = argv[2];
  char *images_list = argv[3];

  paddle_mobile::PaddleMobile<paddle_mobile::CPU> paddle_mobile;
  paddle_mobile.SetThreadNum(8);
  auto isok = paddle_mobile.Load(std::string(model_dir) + "/model",
                                 std::string(model_dir) + "/params", true,
                                 false, 1, true);
  DLOG << "pass init model";

  std::vector<std::string> image_names;
  std::vector<std::pair<int, int>> image_shapes;
  load_images(image_dir, images_list, &image_names, &image_shapes);
  DLOG << "pass load images";

  for (int i = 0; i < image_names.size(); i++) {
    std::string file_name = image_names[i];
    std::vector<float> input;
    std::vector<int64_t> dims{1, 1, 48, 512};
    dims[2] = image_shapes[i].first;
    dims[3] = image_shapes[i].second;
    // load input image
    std::string img_path = std::string(image_dir) + "/" + file_name;
    std::cerr << "img_path: " << img_path << std::endl;
    std::cerr << "shape = [" << dims[0] << ", " << dims[1] << ", " << dims[2]
              << ", " << dims[3] << "]" << std::endl;
    GetInput<float>(img_path, &input, dims);
    // predict
    auto output = paddle_mobile.Predict(input, dims);
    // print result
    std::cerr << file_name << std::endl;
    std::cerr << output[0];
    for (int j = 1; j < output.size(); ++j) {
      std::cerr << " " << output[j];
    }
    std::cerr << std::endl;
  }
  return 0;
}
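For reference, load_images() above reads images_list as whitespace-separated "height width filename" triples, one image per line. A small hypothetical snippet that writes such a list (the file name and image entries are made up):

#include <fstream>

int main() {
  // Each line: <height> <width> <image filename relative to image_dir>
  std::ofstream list("images_list");
  list << 48 << " " << 512 << " " << "img_0001.jpg" << "\n";
  list << 48 << " " << 320 << " " << "img_0002.jpg" << "\n";
  return 0;
}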
tools/pre-commit.hooks/cpplint.hook
Browse file @ 9729edac
...
@@ -5,7 +5,7 @@ TOTAL_ERRORS=0
 # The trick to remove deleted files: https://stackoverflow.com/a/2413151
 for file in $(git diff --cached --name-status | awk '$1 != "D" {print $2}' | \
               grep -v ".pb.cpp" | grep -v ".pb.h" | grep -v ".pb-c.h" | grep -v ".pb-c.c" | \
-              grep -v "protobuf-c.h" | grep -v "protobuf-c.c"); do
+              grep -v "protobuf-c.h" | grep -v "protobuf-c.c" | grep -v "paddle_mobile_jni.cpp"); do
   cpplint $file;
   TOTAL_ERRORS=$(expr $TOTAL_ERRORS + $?);
 done
...
...